import os
import conda
conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib
from mpl_toolkits.basemap import Basemap
# package imports
#basics
import numpy as np
import pandas as pd
#misc
import gc
import time
import warnings
#viz
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.gridspec as gridspec
# graph viz
import plotly.offline as pyo
from plotly.graph_objs import *
import plotly.graph_objs as go
#map section
import imageio
import folium
import folium.plugins as plugins
from mpl_toolkits.basemap import Basemap
#graph section
import networkx as nx
import heapq # for getting top n number of things from list,dict
#settings
start_time=time.time()
color = sns.color_palette()
sns.set_style("dark")
warnings.filterwarnings("ignore")
pyo.init_notebook_mode()
%matplotlib inline
import simplejson as json
def readReview(path, business):
    """Stream Yelp review.json and inner-join each review with `business`.

    Reviews are parsed line-by-line (one JSON object per line) and merged with
    the `business` frame on 'business_id' in chunks of ~1e6 records so the
    full raw review list never has to be held in memory at once. Reviews whose
    business_id is absent from `business` are dropped by the inner merge.

    Parameters:
        path: directory containing review.json (must end with a separator).
        business: DataFrame with a 'business_id' column to join against.

    Returns:
        DataFrame of merged review+business records (empty if no reviews).
    """
    chunk_size = 1000000
    merged_chunks = []
    records = []
    with open(path + "review.json", 'rb') as f:
        for line in f:
            records.append(json.loads(line))
            if len(records) > chunk_size:
                chunk = pd.merge(pd.DataFrame.from_records(records),
                                 business, on=['business_id'])
                merged_chunks.append(chunk)
                records = []
    # Flush the final partial chunk only if it is non-empty: merging an empty
    # from_records() frame would raise KeyError('business_id').
    if records:
        merged_chunks.append(pd.merge(pd.DataFrame.from_records(records),
                                      business, on=['business_id']))
    if not merged_chunks:
        return pd.DataFrame()
    return pd.concat(merged_chunks, ignore_index=True)
def readBusiness(path):
    """Load Yelp business.json (one JSON object per line) into a DataFrame.

    Parameters:
        path: directory containing business.json (must end with a separator).

    Returns:
        DataFrame with one row per business record.
    """
    records = []
    # Context manager guarantees the file is closed (the original leaked the
    # handle by calling open(...).readlines() without closing).
    with open(path + "business.json", 'rb') as f:
        for line in f:
            records.append(json.loads(line))
    return pd.DataFrame.from_records(records)
# Local path to the unpacked Yelp open dataset (machine-specific).
path = "/Users/xiaxun/Downloads/yelp_dataset/"
business = readBusiness(path)
# Keep Illinois businesses only.
business = business[business.state == "IL"]
# Drop the business-level 'stars' column — presumably so it doesn't collide
# with the per-review 'stars' column after the merge in readReview; confirm.
business = business.drop(['stars'],axis=1)
review = readReview(path, business)
# Quick sanity checks; these lines only render output in a notebook.
review.head(3)
review.info()
review.describe()
business.describe()
business.head()
business.info()
# Sorted list of distinct city spellings (used to build the cleaning rules below).
city = list(business['city'].unique())
city.sort()
city
# Data cleaning: bucket each city spelling into its Illinois county.
business['city'] = business['city'].apply(lambda city: city.lower())

# Towns in Champaign county, matched by city-name prefix.
# NOTE(review): the original code tested 'sidney' twice; the second test may
# have been meant for a different town — verify the list is complete.
champaign_towns = ('broad', 'homer', 'ivesdale', 'dewey', 'fisher', 'gifford',
                   'mahomet', 'ogden', 'philo', 'rantoul', 'savoy', 'sidney',
                   'thomasboro', 'tolono', 'champaign')
is_champaign = business['city'].apply(lambda c: c.startswith(champaign_towns))
business.loc[is_champaign, 'county'] = 'champaign'

# Normalize Urbana spelling variants and assign them to Champaign county.
is_urbana = business['city'].apply(lambda c: c.startswith('urbana'))
business.loc[is_urbana, 'city'] = 'urbana'
business.loc[is_urbana, 'county'] = 'champaign'

# Normalize "St. Joseph" variants (matched on the 'joseph' suffix).
is_st_joseph = business['city'].apply(lambda c: c.endswith('joseph'))
business.loc[is_st_joseph, 'city'] = 'st joseph'
business.loc[is_st_joseph, 'county'] = 'champaign'

# Remaining counties matched on city-name suffixes.
suffix_counties = {
    'vermilion': ('fithian',),
    'piatt': ('mansfield', 'monticello'),
    'cook': ('schaumburg',),
    'will': ('joliet',),
}
for county, suffixes in suffix_counties.items():
    matches = business['city'].apply(lambda c, s=suffixes: c.endswith(s))
    business.loc[matches, 'county'] = county

# Douglas county mixes a suffix match (tuscola) and a prefix match
# ('villa' — presumably Villa Grove).
is_douglas = (business['city'].apply(lambda c: c.endswith('tuscola')) |
              business['city'].apply(lambda c: c.startswith('villa')))
business.loc[is_douglas, 'county'] = 'douglas'
business.head()
# Word count per review. Use Series.apply instead of the deprecated
# DataFrame.applymap on a one-column slice.
review['text length'] = review['text'].apply(lambda t: len(t.split(" ")))
# Lowercase every token.
review['text1'] = review['text'].apply(lambda sen: " ".join(x.lower() for x in sen.split()))
review[['text','text1']].head()
# Strip punctuation: drop every character that is not a word char or whitespace.
# Raw string avoids the invalid-escape warning; regex=True is required on
# modern pandas where str.replace defaults to literal matching.
review['text2'] = review['text1'].str.replace(r'[^\w\s]', '', regex=True)
review[['text1','text2']].head()
# Remove English stop words from the text.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# A set gives O(1) membership tests in the per-token filter below
# (the original used a list: O(len(stop)) per token).
stop = set(stopwords.words('english'))
review['text3'] = review['text2'].apply(lambda sen: " ".join(x for x in sen.split() if x not in stop))
review[['text2','text3']].head()
# Remove the 10 rarest words in the whole corpus.
rare = set(pd.Series(' '.join(review['text3']).split()).value_counts()[-10:].index)
review['text4'] = review['text3'].apply(lambda sen: " ".join(x for x in sen.split() if x not in rare))
review[['text3','text4']].head()
# Spell check
from textblob import TextBlob
# Preview spell-correction on the first 5 reviews only. NOTE(review): the
# result is not assigned anywhere, so this line only renders output in a
# notebook — presumably kept small because correct() is slow on a full corpus.
review['text4'][:5].apply(lambda x: str(TextBlob(x).correct()))
# Lemmatization
# Lemmatize every token with WordNet via textblob's Word wrapper.
import nltk
nltk.download('wordnet')
from textblob import Word
review['text5'] = review['text4'].apply(
    lambda sentence: " ".join(Word(token).lemmatize() for token in sentence.split())
)
review[['text4','text5']]
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit tf-idf on the cleaned/lemmatized text.
tf = TfidfVectorizer(lowercase=True, analyzer='word', stop_words='english')
tfidf_matrix = tf.fit_transform(review['text5'])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
vocab_idf = dict(zip(tf.get_feature_names_out(), tf.idf_))
# One row per vocabulary term; the single 'tfidf' column holds its idf weight.
# (The original called the classmethod from_dict on a throwaway instance and
# then renamed the column; build the frame directly instead.)
tfidf = pd.DataFrame.from_dict(vocab_idf, orient='index', columns=['tfidf'])
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Take the 50 terms with the LOWEST idf — i.e. the terms that appear in the
# most reviews — and render them as a word cloud.
common_terms = tfidf.sort_values(by=['tfidf'], ascending=True).head(50)
text = ' '.join(list(common_terms.index))
wordcloud = WordCloud()
wordcloud.generate(text)
plt.figure(figsize=(25, 25))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# Grid of histograms of review length, one panel per star rating
# (FacetGrid per the seaborn docs).
sns.set_style('white')
grid = sns.FacetGrid(review, col='stars')
grid.map(plt.hist, 'text length')
# Box plot of text length for each star category.
sns.boxplot(x='stars', y='text length', data=review, palette='rainbow')
# Count of reviews per star rating.
sns.countplot(x='stars', data=review, palette='rainbow')
# Mean of the numeric columns per star rating. numeric_only=True is required
# on modern pandas, where DataFrame.mean() no longer silently drops
# non-numeric columns such as the review text.
stars = review.groupby('stars').mean(numeric_only=True)
stars
stars.corr()
# Correlation heatmap between the per-star numeric means.
sns.heatmap(stars.corr(), cmap='coolwarm', annot=True)
# Review counts across the different counties in IL
# How many businesses each county has.
sns.countplot(data=business, x='county')
# Compare businesses in Champaign, Piatt, Douglas and Cook counties. (Evanston is located in Cook.)
# Scatter "maps" of review locations, one panel per county of interest.
# .copy() avoids SettingWithCopyWarning when the popularity column is added
# to what would otherwise be a view of `review`.
rating_data = review[['latitude', 'longitude', 'stars', 'review_count']].copy()
# Custom popularity metric: stars * number of reviews.
rating_data['popularity'] = rating_data['stars'] * rating_data['review_count']

def _plot_region(ax, title, lat, lon, dlon=(0.1, 0.3), dlat=(0.2, 0.3)):
    """Scatter every rating inside the box [lon-dlon[0], lon+dlon[1]] x
    [lat-dlat[0], lat+dlat[1]] on `ax`, and return the subset for reuse."""
    lon_min, lon_max = lon - dlon[0], lon + dlon[1]
    lat_min, lat_max = lat - dlat[0], lat + dlat[1]
    subset = rating_data[(rating_data["longitude"] > lon_min) &
                         (rating_data["longitude"] < lon_max) &
                         (rating_data["latitude"] > lat_min) &
                         (rating_data["latitude"] < lat_max)]
    subset.plot(kind='scatter', x='longitude', y='latitude',
                color='yellow', s=.02, alpha=.6, subplots=True, ax=ax)
    ax.set_title(title)
    ax.set_facecolor('black')
    return subset

f, axes = plt.subplots(2, 2, figsize=(15, 7))
# Anchor coordinates are arbitrary points inside each county; the box offsets
# were hand-tuned to frame the picture.
ratings_data_champion = _plot_region(axes[0, 0], "Champaign", 40.133919, -88.248628)
ratings_data_piatt = _plot_region(axes[0, 1], "Piatt", 40.035074, -88.569312)
# Douglas uses a much tighter box east/north of its anchor.
ratings_data_dougles = _plot_region(axes[1, 0], "Douglas", 39.821199, -88.246201,
                                    dlon=(0.1, 0.01), dlat=(0.2, 0.01))
# NOTE(review): (33.639, -112.208) is in the Phoenix, AZ area, not Cook
# County, IL — this panel will be empty for IL-only data; verify the anchor.
ratings_data_cook = _plot_region(axes[1, 1], "Cook", 33.639053, -112.208011)
plt.tight_layout(pad=1.5)
f.show()
# Let's take a look at how people rated different businesses in Champaign. The following is an
# interactive animation, using the Folium package to create Leaflet map visuals. The animation
# highlights businesses by their star rating, to see whether there are hotspots/concentrations
# of great restaurants. It turns out good and bad businesses are peppered around the city quite evenly.
data = []
# Rearrange into folium HeatMapWithTime format: one [[lat, lon], ...] frame
# per star rating. Sort the ratings so the animation (and its displayed
# index) steps through stars in order instead of first-seen order.
stars_list = sorted(rating_data['stars'].unique())
for star in stars_list:
    subset = ratings_data_champion[ratings_data_champion['stars'] == star]
    data.append(subset[['latitude', 'longitude']].values.tolist())
# Initialize the map centered on Champaign.
lat = 40.133919
lon = -88.248628
zoom_start = 11
print(" Champaign Review heatmap Animation ")
# basic map
m = folium.Map(location=[lat, lon], tiles="OpenStreetMap", zoom_start=zoom_start)
# HeatMapWithTime plugin animates one heat layer per star-rating frame.
hm = plugins.HeatMapWithTime(data, max_opacity=0.3, auto_play=True, display_index=True, radius=20)
hm.add_to(m)
m
x = business.categories.value_counts()
print("In IL, there are ", len(x), " different types/categories of Businesses in Yelp!")
# value_counts() already returns counts sorted descending, so the original
# re-sort was redundant; just keep the 20 largest.
x = x.iloc[0:20]
plt.figure(figsize=(16, 4))
# seaborn 0.12 removed positional data arguments; pass x/y by keyword.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.5)
plt.title("What are the top categories?", fontsize=25)
locs, labels = plt.xticks()
plt.setp(labels, rotation=80)
plt.ylabel('# businesses', fontsize=12)
plt.xlabel('Category', fontsize=12)
# Annotate each bar with its count.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, label, ha='center', va='bottom')
plt.show()
# Ten most common business names (display-only).
business.name.value_counts().index[:10].tolist()
# Top 10 most-reviewed businesses, bars colored by category.
grid = sns.catplot(x="review_count", y="name",
                   data=business.nlargest(10, 'review_count'),
                   kind="bar", hue="categories", dodge=False, height=10)
plt.subplots_adjust(top=0.9)
grid.fig.suptitle('Top 10 Most Reviewed Businesses And Categories Lables Used')  # can also get the figure from plt.gcf()
# NOTE(review): if 'categories' is a comma-separated STRING column (as in
# current Yelp dumps), .str.len() counts characters, not keywords —
# business['categories'].str.split(',').str.len() would count keywords.
# .str.len() only equals the keyword count if 'categories' is a list column;
# confirm the dtype before trusting this feature.
business['Num_Keywords'] = business['categories'].str.len()
# Top 10 businesses with the most category keywords.
business[['categories','Num_Keywords']].sort_values('Num_Keywords',ascending = False).head(10)
# We can see that some businesses use as many as 37 keywords in their categories. Let's look at some of them.
# How about the overall distribution of the number of keywords used by businesses? Let's find out.
# Histogram of the Num_Keywords feature across all businesses.
fig, ax = plt.subplots()
keyword_counts = business['Num_Keywords']
ax.hist(keyword_counts, 100, color='green', alpha=0.7)
plt.show()
# NLP Classification Task
# NLP classification: predict the star rating from the raw review text with
# a bag-of-words + multinomial naive Bayes model.
# NOTE(review): review_class is a reference to `review`, not a copy.
review_class = review
X_ = review_class['text']
y_ = review_class['stars']
# Rebalance the classes by duplicating minority-star samples: each star s
# gets extra_copies[s] additional copies on top of the originals (so the
# concat below reproduces the original [X_, x1, x2,x2,x2, x3,x3, x4] order).
# NOTE(review): oversampling BEFORE train_test_split leaks duplicates of the
# same review into both train and test sets, inflating the reported scores.
extra_copies = {1: 1, 2: 3, 3: 2, 4: 1}
X_parts, y_parts = [X_], [y_]
for star, copies in extra_copies.items():
    mask = y_ == star
    X_parts.extend([X_[mask]] * copies)
    y_parts.extend([y_[mask]] * copies)
X = pd.concat(X_parts)
y = pd.concat(y_parts)
# Vectorize the text into token counts.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))
# Using Text Processing
# Same classification task, but on the cleaned/lemmatized text ('text5') and
# wrapped in a scikit-learn Pipeline.
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# NOTE(review): TfidfVectorizer already applies tf-idf weighting
# (CountVectorizer + TfidfTransformer), so the extra 'tfidf' step re-weights
# the tf-idf matrix a second time. Kept for result parity with the original,
# but one of the two stages should probably be removed.
pipeline = Pipeline([
    ('bow', TfidfVectorizer(lowercase=True, analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])
X_ = review_class['text5']
y_ = review_class['stars']
# Duplication-based oversampling to rebalance the star classes (this leaks
# duplicates across the later train/test split — see the naive Bayes section).
extra_copies = {1: 1, 2: 3, 3: 2, 4: 1}
X_parts, y_parts = [X_], [y_]
for star, copies in extra_copies.items():
    mask = y_ == star
    X_parts.extend([X_[mask]] * copies)
    y_parts.extend([y_[mask]] * copies)
X = pd.concat(X_parts)
y = pd.concat(y_parts)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))